# -*- coding: utf-8 -*-
import scrapy
import json
from xiangmu.items import DoubanItem


class DoubanSpider(scrapy.Spider):
    """Scrape movie details from Douban's movie search listing.

    Flow:
      1. ``start_requests`` requests pages of the ``search_subjects`` JSON
         endpoint (tag "热门", sorted by rank, 20 entries per page).
      2. ``parse`` reads the ``subjects`` array of each JSON page and follows
         every subject's ``url``.
      3. ``parse_page`` extracts fields from the detail page and yields a
         ``DoubanItem``.
    """

    name = 'newvideo'
    # Must cover the hosts actually requested (movie.douban.com); otherwise
    # Scrapy's offsite filtering drops the detail-page requests yielded from
    # ``parse`` and ``parse_page`` is never called.
    # NOTE(review): 'doubanio.com' is included on the assumption that poster
    # images are served from that domain -- confirm against real @src values.
    allowed_domains = ['douban.com', 'doubanio.com']

    # Per-spider override of the default request headers (takes precedence
    # over DEFAULT_REQUEST_HEADERS in the project settings).
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        },
    }

    # ``start_requests`` is used instead of ``start_urls`` because the
    # listing URLs are generated from a template.
    def start_requests(self):
        """Yield one request per listing page.

        The template is kept in its own variable and never rebound: calling
        ``.format`` on an already formatted URL is a no-op, which previously
        made every page after the first a duplicate of ``page_start=20``.
        """
        url_template = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start={}'
        page = 3
        # NOTE(review): the range starts at 1, so offsets are 20 and 40 and
        # the first 20 entries (page_start=0) are skipped -- confirm intent.
        for i in range(1, page):
            yield scrapy.Request(url=url_template.format(i * 20),
                                 callback=self.parse)

    def parse(self, response):
        """Parse a JSON listing page and follow each subject's detail URL.

        A body that is not valid JSON, or that lacks a ``subjects`` list, is
        logged and skipped instead of raising.
        """
        try:
            payload = json.loads(response.body.decode('utf-8'))
        except ValueError:
            # UnicodeDecodeError and json.JSONDecodeError both derive from
            # ValueError.
            self.logger.warning('Listing response is not valid JSON: %s',
                                response.url)
            return

        subjects = payload.get('subjects') if isinstance(payload, dict) else None
        if not subjects:
            self.logger.warning('No subjects in listing response: %s',
                                response.url)
            return

        for subject in subjects:
            detail_url = subject.get('url')
            if detail_url:
                yield scrapy.Request(url=detail_url, callback=self.parse_page)

    def parse_page(self, response):
        """Extract one movie's fields from its detail page and yield an item.

        Missing optional nodes yield an empty string (``None`` for
        ``daoyan`` and ``photo``) rather than raising ``IndexError``. A page
        without a title is logged and skipped.
        """
        def first_text(query):
            # First match of the XPath, or '' when nothing matches.
            return response.xpath(query).extract_first(default='')

        title = first_text('//h1/span[1]/text()')
        if not title:
            self.logger.warning('No title found, skipping page: %s',
                                response.url)
            return

        # Drop the first and last character of the second <h1> span.
        uper_time = first_text('//h1/span[2]/text()')[1:-1]
        # Director.
        daoyan = response.xpath('//a[@rel="v:directedBy"]/text()').extract_first()
        # Lead actor.
        # NOTE(review): only the first matching name is kept, as before; the
        # field name suggests a list may have been intended -- confirm.
        zhuyan = first_text('//div[@id="info"]/span[3]/span[2]/a/text()')
        # Genre (named ``genre`` locally to avoid shadowing builtin ``type``).
        genre = first_text('//div[@id="info"]/span[6]/text()')
        # Total running time.
        all_time = first_text('//div[@id="info"]/span[12]/text()')
        # Region; the [11:-1] slice is kept from the original extraction.
        diqu = first_text('//div[@id="info"]/span[10]/text()')[11:-1]
        # Poster image URL.
        photo = response.xpath('//img[@rel="v:image"]/@src').extract_first()
        # Synopsis.
        neirong = first_text('//div[@class="indent"]/span/text()').strip()

        item = DoubanItem()
        item['title'] = title
        item['uper_time'] = uper_time
        item['daoyan'] = daoyan
        item['zhuyan_list'] = zhuyan
        item['type'] = genre
        item['all_time'] = all_time
        item['diqu'] = diqu
        item['photo'] = photo
        # Never hand [None] to a media pipeline when the poster is missing.
        item['image_urls'] = [photo] if photo else []
        item['neirong'] = neirong
        yield item


